Code
# ---- setup ----
# Suppress the default "##" prefix on printed chunk output.
knitr::opts_chunk$set(comment = NA)

# Data cleaning and wrangling helpers
library(janitor)
library(stringr)
library(naniar)

# Model summaries and regression modeling
library(broom)
library(rms)

# Visualization and presentation tables
library(GGally)
library(kableExtra)

# Classification / validation utilities
library(caret)
library(pROC)

# Load tidyverse last so its verbs (e.g. dplyr::filter) mask
# same-named functions from the packages loaded above.
library(tidyverse)

# Use the black-and-white ggplot2 theme for all plots in this document.
theme_set(theme_bw())
theme_set(theme_bw())

This data is from the “Tidy Tuesday archive” datasets on GitHub, which can be found here. The data dictionary can also be found here. The data was gathered by the United States Environmental Protection Agency (EPA) via the “EPA’s National Vehicle and Fuel Emissions Laboratory in Ann Arbor, Michigan, and by vehicle manufacturers who submit their own test data to EPA”. With this, there was no sampling strategy; the data was collected to provide more information about cars’ fuel economy estimates.
The subjects of our data describe various car models from various manufacturers and they were sampled based on when the cars were tested for EPA compliance.
# load data
data <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-15/big_epa_cars.csv", show_col_types = FALSE)

Here, we ingest our data from the URL where the dataset is hosted.
There are a number of steps that we have to take to get our dataset prepared, beginning with getting a subset of variables from the original dataset. These variables represent our predictor, outcome, and identifying variables. We also want to create a new variable, decade, which will represent the decade when the vehicle was recorded.
# Subset to the variables we need: identifiers (id, make, model, year),
# candidate predictors, and outcomes. clean_names() converts the source's
# camelCase column names (e.g. fuelCost08) to snake_case (fuel_cost08).
data <- data |>
select(id, make, model, year, comb08, cylinders, displ,
drive, fuelCost08,fuelType1, trany, tCharger) |>
clean_names()
# create decade variable
data <- data |>
mutate(decade = case_when(year < 1990 ~ "1980s",
1990 <= year & year < 2000 ~ "1990s",
2000 <= year & year < 2010 ~ "2000s",
2010 <= year & year < 2020 ~ "2010s",
TRUE ~ "2020s"))

We also want to change the names of some of our variables to be more concise and descriptive.
# change variable names and drop original category variables
data <- data |>
mutate(mpg = comb08) |>
mutate(displacement = displ) |>
mutate(yearly_fuel_cost = fuel_cost08) |>
mutate(fuel_type = fuel_type1) |>
mutate(transmission = trany) |>
mutate(is_turbo = t_charger) |>
select(id, make, model, year, decade, mpg,
cylinders, displacement, drive,
yearly_fuel_cost, fuel_type,
transmission, is_turbo)

Some of the variables’ data types also have to be appropriately adjusted, so we will do so accordingly.
data <- data |>
mutate(id = as.character(id)) |>
mutate(year = as.character(year)) |>
mutate(cylinders = as.factor(cylinders)) |>
mutate(drive = as.factor(drive)) |>
mutate(fuel_type = as.factor(fuel_type)) |>
mutate(transmission = as.factor(transmission)) |>
mutate(is_turbo = as.factor(is_turbo)) |>
mutate(decade = as.factor(decade))

Another thing that we have to do is make some variable values more descriptive and usable by the models.
# Collapse the detailed transmission descriptions to their first word
# ("Automatic" or "Manual") and store the result as a factor.
data <- data |>
  mutate(transmission = as.factor(word(transmission, 1)))
# turn NA values into 0
data <- data |>
mutate(is_turbo = case_when(is.na(is_turbo) ~ 0, TRUE ~ 1)) |>
mutate(is_turbo = as.factor(is_turbo))

We also want to filter out some data from the original dataset so we have a more precise pool of car models. Lastly, we will set a seed and then use slice_sample() to subset our data because the full dataset is too large.
# Keep only the most common cylinder counts (4, 6, 8), retaining rows
# where cylinders is missing.
data <- data |>
  filter(cylinders %in% c(4, 6, 8) | is.na(cylinders))

# Drop "2-Wheel Drive": it is ambiguous (could be front- or rear-wheel).
data <- data |>
  filter(drive != "2-Wheel Drive")

# Keep only models that run on Regular or Premium Gasoline.
data <- data |>
  filter(fuel_type %in% c("Regular Gasoline", "Premium Gasoline"))

# Fix the RNG seed so the sample below is reproducible, then draw a
# random subset of 1200 rows (the full dataset is too large to use).
set.seed(432)
epa_data <- data |>
  slice_sample(n = 1200)
# check dimensions of data
dim(epa_data)[1] 1200 13
Here, we will ensure that each of our categorical predictor variables meet specifications.
summary(epa_data$cylinders) 2 3 4 5 6 8 10 12 16
0 0 490 0 453 257 0 0 0
summary(epa_data$drive) 2-Wheel Drive 4-Wheel Drive
0 44
4-Wheel or All-Wheel Drive All-Wheel Drive
195 94
Front-Wheel Drive Part-time 4-Wheel Drive
464 9
Rear-Wheel Drive
394
summary(epa_data$fuel_type) Diesel Electricity Midgrade Gasoline Natural Gas
0 0 0 0
Premium Gasoline Regular Gasoline
342 858
summary(epa_data$transmission)Automatic Manual
855 345
summary(epa_data$is_turbo) 0 1
1014 186
summary(epa_data$decade)1980s 1990s 2000s 2010s 2020s
218 275 312 371 24
We can see that for drive, there are only 9 models that are “Part-time 4-Wheel Drive”, so we will just drop these models because they aren’t very common nowadays and they also don’t represent much of the data.
epa_data <- epa_data |>
filter(drive != "Part-time 4-Wheel Drive")

We can also see that we have 24 vehicles that were made in the 2020s, so we will collapse this group with vehicles made in the 2010s.
# Re-derive decade, collapsing the sparse "2020s" group (only 24 rows)
# into "2010+". Note that year was converted to character earlier, so we
# compare on an explicit numeric version rather than relying on R's
# implicit character/number coercion (which compares lexicographically).
# case_when() evaluates conditions in order, so each branch only needs
# an upper bound.
epa_data <- epa_data |>
mutate(decade = case_when(as.numeric(year) < 1990 ~ "1980s",
                          as.numeric(year) < 2000 ~ "1990s",
                          as.numeric(year) < 2010 ~ "2000s",
                          TRUE ~ "2010+"))
epa_data <- epa_data |>
mutate(decade = as.factor(decade))

Because we removed observations of certain categories, we want to drop these unused factor levels. To do so, we will utilize droplevels().
epa_data <- epa_data |>
mutate(cylinders = droplevels(epa_data$cylinders)) |>
mutate(drive = droplevels(epa_data$drive)) |>
mutate(fuel_type = droplevels(epa_data$fuel_type))

We already have our id variable on the far left, but we want to move our outcome variables to the far right for readability.
epa_data <- epa_data |>
select(id, make, model, year, decade, mpg,
cylinders, displacement, drive,
fuel_type, is_turbo,
transmission, yearly_fuel_cost)Here, we will list our final tidy dataset, which has 1,191 rows and 13 columns.
epa_data# A tibble: 1,191 × 13
id make model year decade mpg cylin…¹ displ…² drive fuel_…³ is_tu…⁴
<chr> <chr> <chr> <chr> <fct> <dbl> <fct> <dbl> <fct> <fct> <fct>
1 12530 Suzuki X-90… 1996 1990s 24 4 1.6 Rear… Regula… 0
2 2089 Volvo 240 … 1986 1980s 22 4 2.3 Rear… Regula… 0
3 25287 Hyundai Sona… 2009 2000s 25 4 2.4 Fron… Regula… 0
4 12930 Honda Odys… 1996 1990s 19 4 2.2 Fron… Regula… 0
5 31559 Audi R8 2012 2010+ 16 8 4.2 All-… Premiu… 0
6 25184 Chevrol… Coba… 2008 2000s 25 4 2 Fron… Premiu… 1
7 21294 GMC Envo… 2005 2000s 15 6 4.2 Rear… Regula… 0
8 3998 GMC S15 … 1987 1980s 20 4 2.5 Rear… Regula… 0
9 13696 GMC Sono… 1997 1990s 20 4 2.2 Rear… Regula… 0
10 22120 Suzuki Aeri… 2006 2000s 23 4 2.3 4-Wh… Regula… 0
# … with 1,181 more rows, 2 more variables: transmission <fct>,
# yearly_fuel_cost <dbl>, and abbreviated variable names ¹cylinders,
# ²displacement, ³fuel_type, ⁴is_turbo
# get number of unique id values
length(unique(epa_data$id))[1] 1191
# get number of rows in data
dim(epa_data)[1] 1191 13
# get type of id variable
class(epa_data$id)[1] "character"
We will now check to see if there are the correct number of unique identifiers in id by comparing its value to the number of rows. We can also see that id is of the character type and that there are 1,191 rows and 13 variables. Since there are also 1,191 unique id values, we know that each row has a unique identifier.
saveRDS(epa_data, "/Users/mtjen/desktop/432/projectA/epa_data.Rds")

Here, we will save our tidied tibble as an R dataset.
The following is our code book to help define variables in our data.
| Variable | Role | Type | Description |
|---|---|---|---|
id |
identifier | character | vehicle identifier |
make |
identifier | character | vehicle company/manufacturer |
model |
identifier | character | vehicle model |
year |
identifier | character | vehicle year |
decade |
input | categorical (4) | vehicle decade [“1980s”, “1990s”, “2000s”, “2010+”] |
mpg |
input | quantitative | combined miles per gallon |
cylinders |
input | categorical (3) | number of engine cylinders [4, 6, 8] |
displacement |
input | quantitative | engine displacement (liters) |
drive |
input | categorical (5) | type of drive axle [“4-Wheel Drive”, “4-Wheel or All-Wheel Drive”, “All-Wheel Drive”, “Front-Wheel Drive”, “Rear-Wheel Drive”] |
fuel_type |
input | categorical (2) | type of fuel [“Premium Gasoline”, “Regular Gasoline”] |
is_turbo |
input | categorical (2) | whether or not the vehicle is turbocharged [0, 1] |
transmission |
input, outcome | categorical (2) | type of transmission [“Automatic”, “Manual”] |
yearly_fuel_cost |
outcome | quantitative | yearly cost of fuel |
Next, we will look at quick numerical summaries/descriptions of our variables.
Hmisc::describe(epa_data)epa_data
13 Variables 1191 Observations
--------------------------------------------------------------------------------
id
n missing distinct
1191 0 1191
lowest : 10012 10026 10032 10047 10075, highest: 9709 972 9813 9820 9926
--------------------------------------------------------------------------------
make
n missing distinct
1191 0 62
lowest : Acura Alfa Romeo American Motors Corporation Audi Bentley
highest: Volkswagen Volvo VPG Wallace Environmental Yugo
--------------------------------------------------------------------------------
model
n missing distinct
1191 0 840
lowest : 135i 135i Convertible 1500 2WD 190E 2.3 2
highest: Yukon 1500 4WD Yukon C1500 2WD Z4 3.0si Z4 Roadster Z4 sDrive28i
--------------------------------------------------------------------------------
year
n missing distinct
1191 0 37
lowest : 1984 1985 1986 1987 1988, highest: 2016 2017 2018 2019 2020
--------------------------------------------------------------------------------
decade
n missing distinct
1191 0 4
Value 1980s 1990s 2000s 2010+
Frequency 218 275 312 386
Proportion 0.183 0.231 0.262 0.324
--------------------------------------------------------------------------------
mpg
n missing distinct Info Mean Gmd .05 .10
1191 0 36 0.995 20.31 5.437 13 15
.25 .50 .75 .90 .95
17 20 23 26 29
lowest : 8 10 11 12 13, highest: 43 44 46 50 52
--------------------------------------------------------------------------------
cylinders
n missing distinct
1191 0 3
Value 4 6 8
Frequency 490 448 253
Proportion 0.411 0.376 0.212
--------------------------------------------------------------------------------
displacement
n missing distinct Info Mean Gmd .05 .10
1191 0 53 0.997 3.225 1.412 1.6 1.8
.25 .50 .75 .90 .95
2.2 3.0 4.0 5.2 5.7
lowest : 1.2 1.3 1.4 1.5 1.6, highest: 6.2 6.4 6.5 6.8 7.4
--------------------------------------------------------------------------------
drive
n missing distinct
1191 0 5
lowest : 4-Wheel Drive 4-Wheel or All-Wheel Drive All-Wheel Drive Front-Wheel Drive Rear-Wheel Drive
highest: 4-Wheel Drive 4-Wheel or All-Wheel Drive All-Wheel Drive Front-Wheel Drive Rear-Wheel Drive
4-Wheel Drive (44, 0.037), 4-Wheel or All-Wheel Drive (195, 0.164), All-Wheel
Drive (94, 0.079), Front-Wheel Drive (464, 0.390), Rear-Wheel Drive (394,
0.331)
--------------------------------------------------------------------------------
fuel_type
n missing distinct
1191 0 2
Value Premium Gasoline Regular Gasoline
Frequency 342 849
Proportion 0.287 0.713
--------------------------------------------------------------------------------
is_turbo
n missing distinct
1191 0 2
Value 0 1
Frequency 1006 185
Proportion 0.845 0.155
--------------------------------------------------------------------------------
transmission
n missing distinct
1191 0 2
Value Automatic Manual
Frequency 846 345
Proportion 0.71 0.29
--------------------------------------------------------------------------------
yearly_fuel_cost
n missing distinct Info Mean Gmd .05 .10
1191 0 44 0.997 2222 638 1400 1600
.25 .50 .75 .90 .95
1800 2150 2600 3050 3300
lowest : 750 800 850 900 950, highest: 3800 4000 4100 4500 4950
--------------------------------------------------------------------------------
Given vehicle time frame, engine properties, and transmission information, can we predict the yearly fuel cost of the vehicle?
The outcome variable that we will be using is yearly_fuel_cost and I am interested in this because of how in general, companies are trying to make cars as fuel efficient as possible nowadays. With that, the fuel cost should in theory be decreasing over time. In particular, I would like to see if various vehicle properties excluding miles per gallon will help to accurately predict what the cost will be.
nrow(epa_data |>
filter(complete.cases(yearly_fuel_cost)))[1] 1191
As we can see, there are 1,191 rows with complete information in yearly_fuel_cost, which means we don’t have any rows with a missing outcome observation.
ggplot(epa_data, aes(x = yearly_fuel_cost)) +
geom_histogram(bins = 20, fill = "lightgreen", col = "white") +
labs(title = "Distribution of Yearly Fuel Cost",
x = "Yearly Fuel Cost",
y = "Number of Vehicles")

Based on this histogram, we can see that the distribution of yearly_fuel_cost is pretty symmetric and normal, with a slight right skew. The values are fairly continuous and there doesn’t appear to be an obvious natural transformation to consider right now.
length(unique(epa_data$yearly_fuel_cost))[1] 44
As we can see, we have 44 distinct yearly_fuel_cost values, which clears the definition of a quantitative variable.
The predictor variables we intend to use for the linear regression model are decade, cylinders, displacement, drive, fuel_type, is_turbo, and transmission.
length(unique(epa_data$displacement))[1] 53
As we can see, we have 53 distinct displacement values, which clears the definition of a quantitative variable.
summary(epa_data$cylinders) 4 6 8
490 448 253
As we can see, cylinders is a multi-categorical variable, as there are three levels, each with at least 30 observations.
complete_outcome_rows <- 1191
7 < 4 + (complete_outcome_rows - 100) / 100[1] TRUE
Here, we can see that the total number of predictor variables (7) is appropriate relative to the amount of rows we are going to use.
For our predictors, there are some intuitive predictions we can make regarding the expected direction of relationships with the yearly_fuel_cost. Off the bat, less cylinders and lower displacement will likely lead to less fuel usage, so a lower cost. fuel_type can also be logically guessed, as premium gas is more expensive than regular. Regarding decades, I would guess that newer cars will have lower costs as cars have become more fuel efficient over time. I have no idea which way drive will go, and similarly for is_turbo, I’m not really sure but I think that the cost will be less if the car has a turbo. Lastly, I would guess that automatic cars are more fuel efficient than manual cars as the shift points are optimized in theory.
Given vehicle time frame, fuel consumption information, engine properties, and transmission information, can we predict if the vehicle has an automatic or manual transmission?
The outcome variable for our logistic regression model is transmission, which specifies if a car is an automatic or manual. I’m interested in this because it’s generally believed that automatics are more fuel efficient than manuals and I’d like to investigate if this is a more general fact or more model specific.
summary(epa_data$transmission)Automatic Manual
846 345
Here, we can see that in our data we have 846 cars that are automatic and 345 that are manual.
The predictor variables we intend to use for the model are decade, mpg, cylinders, displacement, drive, and is_turbo. Most of these are the same as in our linear regression model, with the addition of mpg, which specifies the vehicle’s combined miles per gallon.
length(unique(epa_data$mpg))[1] 36
As we can see, we have 36 distinct mpg values, which clears the definition of a quantitative variable.
smaller_group_rows <- 345
6 < 4 + (smaller_group_rows - 100) / 100[1] TRUE
As we can see, the number of predictor variables for our model is ok.
I’m not sure how cylinders, displacement, drive, and is_turbo are going to relate to transmission, but I think there’s a logical idea for decade and mpg. For decade, I would expect the number of automatics to increase as more time passes as the number of manual cars have been decreasing. For mpg, I think that manual cars will generally have lower values than if it were an automatic, as the shift points aren’t as optimized for fuel efficiency.
miss_var_summary(epa_data)# A tibble: 13 × 3
variable n_miss pct_miss
<chr> <int> <dbl>
1 id 0 0
2 make 0 0
3 model 0 0
4 year 0 0
5 decade 0 0
6 mpg 0 0
7 cylinders 0 0
8 displacement 0 0
9 drive 0 0
10 fuel_type 0 0
11 is_turbo 0 0
12 transmission 0 0
13 yearly_fuel_cost 0 0
From this table, we can see that our data has no missing values.
# run box cox model
box_model <- lm(yearly_fuel_cost ~ 1, data = epa_data)
car::boxCox(box_model)

The Box-Cox model suggests that lambda is 0, which indicates that we should go with a logarithmic transformation. We will now create a log(yearly_fuel_cost) variable to use as our new outcome variable.
# create log outcome variable
epa_data <- epa_data |>
mutate(log_yearly_fuel_cost = log(yearly_fuel_cost))

# dataset we will use for linear regression model
linear_data <- epa_data |>
select(decade, cylinders, displacement, drive, fuel_type,
is_turbo, transmission, log_yearly_fuel_cost)
ggpairs(linear_data, title = "Scatterplot Matrix of Variables")`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
From this scatterplot, we can’t really tell if there’s any collinearity between our predictors, so we will look at variance inflation factors to see if there is.
# create model
modelA <- lm(log_yearly_fuel_cost ~ decade + cylinders +
displacement + drive + fuel_type + is_turbo +
transmission,
data = linear_data)
# check variance inflation factor
car::vif(modelA) GVIF Df GVIF^(1/(2*Df))
decade 1.794883 3 1.102400
cylinders 7.756075 2 1.668824
displacement 7.532555 1 2.744550
drive 2.425388 4 1.117114
fuel_type 1.505218 1 1.226873
is_turbo 1.561387 1 1.249555
transmission 1.191696 1 1.091648
Based on vif(), we have an issue with collinearity with cylinders and displacement as each of the values are above 5. We will drop cylinders from our model as displacement is our quantitative variable.
# Refit the main-effects model without cylinders, which was dropped
# because of its collinearity with displacement (GVIF > 5 for both).
modelA <- lm(log_yearly_fuel_cost ~ decade +
displacement + drive + fuel_type + is_turbo +
transmission,
data = linear_data)
# check variance inflation factor
car::vif(modelA) GVIF Df GVIF^(1/(2*Df))
decade 1.760545 3 1.098857
displacement 1.724100 1 1.313050
drive 2.365296 4 1.113617
fuel_type 1.423129 1 1.192950
is_turbo 1.510096 1 1.228860
transmission 1.171442 1 1.082332
As we can see, we now have no issues with collinearity.
# fit model
dist <- datadist(linear_data)
options(datadist = "dist")
modelA_o <- ols(log_yearly_fuel_cost ~ decade +
displacement + drive + fuel_type + is_turbo +
transmission,
data = linear_data, x = TRUE, y = TRUE)
# get coefficient values
tidy(modelA, conf.int = TRUE, conf.level = 0.90) |>
select(term, estimate, conf.low, conf.high, p.value) |>
kable(dig = 4)| term | estimate | conf.low | conf.high | p.value |
|---|---|---|---|---|
| (Intercept) | 7.5334 | 7.4914 | 7.5753 | 0.0000 |
| decade1990s | -0.0176 | -0.0351 | -0.0001 | 0.0988 |
| decade2000s | -0.0462 | -0.0638 | -0.0286 | 0.0000 |
| decade2010+ | -0.2155 | -0.2345 | -0.1965 | 0.0000 |
| displacement | 0.1303 | 0.1245 | 0.1361 | 0.0000 |
| drive4-Wheel or All-Wheel Drive | 0.0224 | -0.0127 | 0.0575 | 0.2937 |
| driveAll-Wheel Drive | -0.0083 | -0.0435 | 0.0268 | 0.6960 |
| driveFront-Wheel Drive | -0.1310 | -0.1635 | -0.0985 | 0.0000 |
| driveRear-Wheel Drive | -0.0383 | -0.0705 | -0.0061 | 0.0507 |
| fuel_typeRegular Gasoline | -0.1996 | -0.2142 | -0.1850 | 0.0000 |
| is_turbo1 | 0.0747 | 0.0559 | 0.0935 | 0.0000 |
| transmissionManual | -0.0112 | -0.0245 | 0.0020 | 0.1618 |
# get key fit summary statistics
glance(modelA) |>
select(r.squared, AIC, BIC) |>
kable(digits = 4)| r.squared | AIC | BIC |
|---|---|---|
| 0.8033 | -1733.576 | -1667.503 |
# get residual plots
par(mfrow = c(2,2)); plot(modelA); par(mfrow = c(1,1))

Here, we fit our main model and we can see that it predicts log_yearly_fuel_cost pretty well, with \(R^2\) being 0.803. From the residual plots, we can see that there are no issues with linearity, homoscedasticity, normality, and leverage. This is because for the top left, top right, and bottom left graphs, there are no problematic trends and for the bottom right graph, none of the points are within the contours of Cook’s distance.
# build spearman object
modA_spear <- spearman2(log_yearly_fuel_cost ~ decade +
displacement + drive + fuel_type + is_turbo +
transmission,
data = linear_data)
plot(modA_spear)By looking at our Spearman \(\rho^2\) plot, we can see that there are two variables that are clearly the most likely to make an impact, which are displacement and drive. Because we are aiming to add around six degrees of freedom, we are going to add a restricted cubic spline of 4 knots to displacement, which will add two additional degrees. We will also add an interaction term between displacement and drive, which will add four degrees of freedom for a total of six additional degrees.
# Augmented model: a restricted cubic spline on displacement with 4 knots
# (adds 2 degrees of freedom) plus a restricted interaction (%ia%) between
# displacement and drive (adds 4 df), for 6 additional df total.
modelB <- lm(log_yearly_fuel_cost ~ rcs(displacement, 4) + decade +
drive + fuel_type + is_turbo + transmission +
displacement %ia% drive,
data = linear_data)
# Register the data distribution so rms can compute effect summaries.
dist <- datadist(linear_data)
options(datadist = "dist")
# Refit the same augmented model with rms::ols so we can use the rms
# effects plot and validate(); x/y are retained for resampling validation.
modelB_o <- ols(log_yearly_fuel_cost ~ rcs(displacement, 4) + decade +
drive + fuel_type + is_turbo + transmission +
displacement %ia% drive,
data = linear_data, x = TRUE, y = TRUE)
# get coefficient values
tidy(modelB, conf.int = TRUE, conf.level = 0.90) |>
select(term, estimate, conf.low, conf.high, p.value) |>
kable(dig = 4)| term | estimate | conf.low | conf.high | p.value |
|---|---|---|---|---|
| (Intercept) | 7.1453 | 7.0168 | 7.2738 | 0.0000 |
| rcs(displacement, 4)displacement | 0.2903 | 0.2469 | 0.3336 | 0.0000 |
| rcs(displacement, 4)displacement' | -0.5657 | -0.7191 | -0.4123 | 0.0000 |
| rcs(displacement, 4)displacement'' | 1.0713 | 0.7489 | 1.3936 | 0.0000 |
| decade1990s | -0.0183 | -0.0351 | -0.0015 | 0.0732 |
| decade2000s | -0.0569 | -0.0739 | -0.0398 | 0.0000 |
| decade2010+ | -0.2188 | -0.2370 | -0.2005 | 0.0000 |
| drive4-Wheel or All-Wheel Drive | 0.0900 | -0.0162 | 0.1962 | 0.1632 |
| driveAll-Wheel Drive | 0.0561 | -0.0595 | 0.1718 | 0.4243 |
| driveFront-Wheel Drive | -0.0614 | -0.1712 | 0.0483 | 0.3571 |
| driveRear-Wheel Drive | 0.0696 | -0.0325 | 0.1717 | 0.2620 |
| fuel_typeRegular Gasoline | -0.1946 | -0.2087 | -0.1805 | 0.0000 |
| is_turbo1 | 0.0918 | 0.0733 | 0.1103 | 0.0000 |
| transmissionManual | -0.0030 | -0.0157 | 0.0098 | 0.6999 |
| displacement %ia% drivedisplacement * drive=4-Wheel or All-Wheel Drive | -0.0151 | -0.0430 | 0.0128 | 0.3724 |
| displacement %ia% drivedisplacement * drive=All-Wheel Drive | -0.0177 | -0.0497 | 0.0143 | 0.3626 |
| displacement %ia% drivedisplacement * drive=Front-Wheel Drive | -0.0106 | -0.0423 | 0.0211 | 0.5814 |
| displacement %ia% drivedisplacement * drive=Rear-Wheel Drive | -0.0264 | -0.0533 | 0.0005 | 0.1066 |
# plot of effects
plot(summary(modelB_o))

# get residual plots
par(mfrow = c(2,2)); plot(modelB); par(mfrow = c(1,1))

Here, we fit our augmented model using both ols and lm so that we are able to get both the effects plot and the residual plots. From the residual plots, we can see that there are again no issues with linearity, homoscedasticity, normality, and leverage. This is because for the top left, top right, and bottom left graphs, there are no problematic trends and for the bottom right graph, none of the points are within the contours of Cook’s distance.
# validate model A
set.seed(123454321)
validate(modelA_o) index.orig training test optimism index.corrected n
R-square 0.8033 0.8072 0.8014 0.0058 0.7975 40
MSE 0.0134 0.0131 0.0135 -0.0004 0.0137 40
g 0.2670 0.2677 0.2667 0.0010 0.2660 40
Intercept 0.0000 0.0000 0.0011 -0.0011 0.0011 40
Slope 1.0000 1.0000 0.9999 0.0001 0.9999 40
# validate model B
set.seed(123454321)
validate(modelB_o) index.orig training test optimism index.corrected n
R-square 0.8216 0.8268 0.8188 0.0080 0.8136 40
MSE 0.0121 0.0118 0.0123 -0.0005 0.0126 40
g 0.2704 0.2712 0.2699 0.0013 0.2691 40
Intercept 0.0000 0.0000 0.0173 -0.0173 0.0173 40
Slope 1.0000 1.0000 0.9978 0.0022 0.9978 40
After validating the main effects and augmented models, we can see the validated \(R^2\) and mean squared error values. In terms of both metrics, it appears that the augmented model performs slightly better.
After everything, I would choose the main effects model over the augmented model even though the augmented model performed better. As seen before, the residual plots for both models were nearly the same and the augmented model’s validated \(R^2\) was only 0.0161 better than the main effects model’s validated \(R^2\). Similarly, the validated MSE of the augmented model is 0.0011 better than the validated MSE of the main effects model. In my opinion, these small increases aren’t worth adding an extra six degrees of freedom to our main effects model.
# get coefficient values
tidy(modelA, conf.int = TRUE, conf.level = 0.90) |>
select(term, estimate, conf.low, conf.high, p.value) |>
kable(dig = 4)| term | estimate | conf.low | conf.high | p.value |
|---|---|---|---|---|
| (Intercept) | 7.5334 | 7.4914 | 7.5753 | 0.0000 |
| decade1990s | -0.0176 | -0.0351 | -0.0001 | 0.0988 |
| decade2000s | -0.0462 | -0.0638 | -0.0286 | 0.0000 |
| decade2010+ | -0.2155 | -0.2345 | -0.1965 | 0.0000 |
| displacement | 0.1303 | 0.1245 | 0.1361 | 0.0000 |
| drive4-Wheel or All-Wheel Drive | 0.0224 | -0.0127 | 0.0575 | 0.2937 |
| driveAll-Wheel Drive | -0.0083 | -0.0435 | 0.0268 | 0.6960 |
| driveFront-Wheel Drive | -0.1310 | -0.1635 | -0.0985 | 0.0000 |
| driveRear-Wheel Drive | -0.0383 | -0.0705 | -0.0061 | 0.0507 |
| fuel_typeRegular Gasoline | -0.1996 | -0.2142 | -0.1850 | 0.0000 |
| is_turbo1 | 0.0747 | 0.0559 | 0.0935 | 0.0000 |
| transmissionManual | -0.0112 | -0.0245 | 0.0020 | 0.1618 |
These are the coefficients of our final model with a 90% confidence interval.
# get numeric effect sizes
summary(modelA_o) |> kable(digits = 3)| Low | High | Diff. | Effect | S.E. | Lower 0.95 | Upper 0.95 | Type | |
|---|---|---|---|---|---|---|---|---|
| displacement | 2.2 | 4 | 1.8 | 0.235 | 0.006 | 0.222 | 0.247 | 1 |
| decade - 1980s:2010+ | 4.0 | 1 | NA | 0.216 | 0.012 | 0.193 | 0.238 | 1 |
| decade - 1990s:2010+ | 4.0 | 2 | NA | 0.198 | 0.011 | 0.177 | 0.219 | 1 |
| decade - 2000s:2010+ | 4.0 | 3 | NA | 0.169 | 0.010 | 0.149 | 0.190 | 1 |
| drive - 4-Wheel Drive:Front-Wheel Drive | 4.0 | 1 | NA | 0.131 | 0.020 | 0.092 | 0.170 | 1 |
| drive - 4-Wheel or All-Wheel Drive:Front-Wheel Drive | 4.0 | 2 | NA | 0.153 | 0.011 | 0.131 | 0.175 | 1 |
| drive - All-Wheel Drive:Front-Wheel Drive | 4.0 | 3 | NA | 0.123 | 0.015 | 0.093 | 0.152 | 1 |
| drive - Rear-Wheel Drive:Front-Wheel Drive | 4.0 | 5 | NA | 0.093 | 0.010 | 0.074 | 0.112 | 1 |
| fuel_type - Premium Gasoline:Regular Gasoline | 2.0 | 1 | NA | 0.200 | 0.009 | 0.182 | 0.217 | 1 |
| is_turbo - 1:0 | 1.0 | 2 | NA | 0.075 | 0.011 | 0.052 | 0.097 | 1 |
| transmission - Manual:Automatic | 1.0 | 2 | NA | -0.011 | 0.008 | -0.027 | 0.005 | 1 |
# plot effect sizes
plot(summary(modelA_o))

displacement: If we have two subjects from the same decade, drive, fuel type, is_turbo, and transmission, then if subject 1 has a displacement of 2.2 liters and subject 2 has a displacement of 4 liters, then the model estimates that subject 2 will have a log_yearly_fuel_cost that is 0.235 higher than subject 1. The 95% confidence interval around that estimated effect on log_yearly_fuel_cost ranges from (0.222, 0.247). The increase in yearly fuel cost makes sense with higher displacement engines because displacement measures the volume of air that is moved by pistons inside the cylinders of an engine. Because of the combustion processes occurring within cylinders, the engine’s air/fuel ratio has to stay relatively constant to maintain a healthy engine. As such, more displacement means that more air is being taken in by the engine, in turn leading to more fuel consumption. With all of this, more fuel is used which means that more money has to be spent on fuel.
# validate model A
set.seed(123454321)
validate(modelA_o) index.orig training test optimism index.corrected n
R-square 0.8033 0.8072 0.8014 0.0058 0.7975 40
MSE 0.0134 0.0131 0.0135 -0.0004 0.0137 40
g 0.2670 0.2677 0.2667 0.0010 0.2660 40
Intercept 0.0000 0.0000 0.0011 -0.0011 0.0011 40
Slope 1.0000 1.0000 0.9999 0.0001 0.9999 40
Again, the validated \(R^2\) value for Model A is 0.7975, which is pretty good.
# nomogram
plot(nomogram(modelA_o, fun = exp, abbrev = TRUE),
cex.axis = 0.4) # font sizeHere is the model A nomogram, where we can make a prediction as to a vehicle’s yearly fuel cost using set input parameters. As examples, we will select some vehicles that are in the original dataset but were excluded from the final dataset due to size.
# make original dataset look same as final dataset
data <- data |>
mutate(log_yearly_fuel_cost = log(yearly_fuel_cost)) |>
mutate(decade = case_when(year < 1990 ~ "1980s",
1990 <= year & year < 2000 ~ "1990s",
2000 <= year & year < 2010 ~ "2000s",
TRUE ~ "2010+"))
# get unmatched rows
notInc <- anti_join(data, epa_data)Joining with `by = join_by(id, make, model, year, decade, mpg, cylinders,
displacement, drive, yearly_fuel_cost, fuel_type, transmission, is_turbo,
log_yearly_fuel_cost)`
# Look up two specific vehicles among the rows excluded from the
# sampled dataset. Note: year is stored as character here, so the
# comparisons year == 1996 / year == 2011 rely on R coercing the
# numeric to "1996"/"2011" — which matches the stored values.
bmw <- notInc |>
filter(make == "BMW") |>
filter(model == "M3") |>
filter(year == 1996) |>
filter(transmission == "Manual")
mini <- notInc |>
filter(make == "MINI") |>
filter(model == "Clubman") |>
filter(year == 2011) |>
filter(transmission == "Manual")
# Predict yearly fuel cost for each car. The model predicts on the
# log scale, so exp() back-transforms the fit and the bounds of the
# 90% *prediction* interval to dollars.
bmw_pred <- exp(predict.lm(modelA, newdata = bmw,
interval = "prediction", level = 0.90))
mini_pred <- exp(predict.lm(modelA, newdata = mini,
interval = "prediction", level = 0.90))
bmw_pred fit lwr upr
1 2652.335 2188.382 3214.648
bmw |> select(yearly_fuel_cost)# A tibble: 1 × 1
yearly_fuel_cost
<dbl>
1 2350
mini_pred fit lwr upr
1 1610.039 1328.101 1951.827
mini |> select(yearly_fuel_cost)# A tibble: 1 × 1
yearly_fuel_cost
<dbl>
1 1650
Here, we will get predictions of yearly fuel cost for two cars that my parents had. For the BMW, it was predicted that the yearly fuel cost would be 2,652.335 with a 90% prediction interval of (2,188.382, 3,214.648). The actual yearly fuel cost was 2,350, so it wasn’t that close to the prediction but it did fall within the prediction interval. For the Mini, it was predicted that the yearly fuel cost would be 1,610.039 with a 90% prediction interval of (1,328.101, 1,951.827). The actual yearly fuel cost was 1,650, which is quite close to the predicted value.
miss_var_summary(epa_data)# A tibble: 14 × 3
variable n_miss pct_miss
<chr> <int> <dbl>
1 id 0 0
2 make 0 0
3 model 0 0
4 year 0 0
5 decade 0 0
6 mpg 0 0
7 cylinders 0 0
8 displacement 0 0
9 drive 0 0
10 fuel_type 0 0
11 is_turbo 0 0
12 transmission 0 0
13 yearly_fuel_cost 0 0
14 log_yearly_fuel_cost 0 0
From this table, we can see that our data has no missing values.
# Build the modeling frame for the logistic regression of transmission
# type: keep the candidate predictors and add a 1/0 indicator that the
# transmission is automatic (anything else, including a missing value,
# is coded 0, matching the original catch-all branch).
logistic_data <- epa_data |>
  select(decade, mpg, cylinders, displacement,
         drive, is_turbo, transmission) |>
  mutate(is_auto = if_else(transmission == "Automatic",
                           1, 0, missing = 0))
# Model Y (main effects only), fit twice with identical formulas:
# once with stats::glm() for tidy()/augment() downstream, once with
# rms::lrm() for effect summaries, nomograms, and validation.
modelY_glm <- glm(is_auto ~ decade + mpg + cylinders +
displacement + drive + is_turbo,
data = logistic_data, family = binomial(link = "logit"))
# rms functions need a datadist describing predictor distributions so
# that summary()/nomogram() know which reference values and ranges to use.
dist <- datadist(logistic_data)
options(datadist = "dist")
# x = TRUE, y = TRUE store the design matrix and response in the fit,
# which validate() requires for its bootstrap resampling later on.
modelY_lrm <- lrm(is_auto ~ decade + mpg + cylinders +
displacement + drive + is_turbo,
data = logistic_data, x = TRUE, y = TRUE)
# get coefficient values
tidy(modelY_glm, exponentiate = TRUE, conf.int = TRUE, conf.level = 0.90) |>
select(term, estimate, conf.low, conf.high, p.value) |>
kable(dig = 4)| term | estimate | conf.low | conf.high | p.value |
|---|---|---|---|---|
| (Intercept) | 0.8469 | 0.1749 | 4.4932 | 0.8651 |
| decade1990s | 1.0672 | 0.7682 | 1.4818 | 0.7445 |
| decade2000s | 1.6589 | 1.1861 | 2.3215 | 0.0131 |
| decade2010+ | 3.3587 | 2.1590 | 5.2760 | 0.0000 |
| mpg | 0.9859 | 0.9463 | 1.0284 | 0.5737 |
| cylinders6 | 2.3150 | 1.5174 | 3.5476 | 0.0011 |
| cylinders8 | 2.3322 | 1.0318 | 5.3079 | 0.0887 |
| displacement | 1.5127 | 1.1524 | 1.9930 | 0.0129 |
| drive4-Wheel or All-Wheel Drive | 0.3204 | 0.0938 | 0.8650 | 0.0849 |
| driveAll-Wheel Drive | 1.4390 | 0.3802 | 4.7397 | 0.6268 |
| driveFront-Wheel Drive | 0.6582 | 0.1962 | 1.7317 | 0.5179 |
| driveRear-Wheel Drive | 0.2454 | 0.0735 | 0.6390 | 0.0288 |
| is_turbo1 | 0.9604 | 0.6616 | 1.3971 | 0.8586 |
# effects sizes
summary(modelY_lrm) Effects Response : is_auto
Factor Low High Diff. Effect
mpg 17.0 23 6.0 -0.085039
Odds Ratio 17.0 23 6.0 0.918480
displacement 2.2 4 1.8 0.744980
Odds Ratio 2.2 4 1.8 2.106400
decade - 1980s:2010+ 4.0 1 NA -1.211600
Odds Ratio 4.0 1 NA 0.297730
decade - 1990s:2010+ 4.0 2 NA -1.146500
Odds Ratio 4.0 2 NA 0.317740
decade - 2000s:2010+ 4.0 3 NA -0.705440
Odds Ratio 4.0 3 NA 0.493890
cylinders - 6:4 1.0 2 NA 0.839400
Odds Ratio 1.0 2 NA 2.315000
cylinders - 8:4 1.0 3 NA 0.846820
Odds Ratio 1.0 3 NA 2.332200
drive - 4-Wheel Drive:Front-Wheel Drive 4.0 1 NA 0.418210
Odds Ratio 4.0 1 NA 1.519200
drive - 4-Wheel or All-Wheel Drive:Front-Wheel Drive 4.0 2 NA -0.720050
Odds Ratio 4.0 2 NA 0.486730
drive - All-Wheel Drive:Front-Wheel Drive 4.0 3 NA 0.782150
Odds Ratio 4.0 3 NA 2.186200
drive - Rear-Wheel Drive:Front-Wheel Drive 4.0 5 NA -0.986640
Odds Ratio 4.0 5 NA 0.372830
is_turbo - 1:0 1.0 2 NA -0.040442
Odds Ratio 1.0 2 NA 0.960370
S.E. Lower 0.95 Upper 0.95
0.15113 -0.38126 0.21118
NA 0.68300 1.23510
0.29947 0.15804 1.33190
NA 1.17120 3.78830
0.27135 -1.74340 -0.67973
NA 0.17492 0.50675
0.26176 -1.65960 -0.63347
NA 0.19022 0.53075
0.25478 -1.20480 -0.20607
NA 0.29975 0.81377
0.25800 0.33373 1.34510
NA 1.39620 3.83850
0.49750 -0.12827 1.82190
NA 0.87962 6.18360
0.64688 -0.84964 1.68610
NA 0.42757 5.39820
0.23623 -1.18310 -0.25704
NA 0.30634 0.77334
0.47723 -0.15321 1.71750
NA 0.85795 5.57060
0.20447 -1.38740 -0.58589
NA 0.24973 0.55661
0.22696 -0.48527 0.40439
NA 0.61553 1.49840
plot(summary(modelY_lrm)) # summary statistics
modelY_lrm$stats["C"] C
0.752597
modelY_lrm$stats["R2"] R2
0.2315688
# Confusion matrix for model Y. The classification rule labels an
# observation as automatic when its fitted probability is at least 0.5.
confusion_data <- augment(modelY_glm, type.predict = "response")
predicted_auto <- factor(confusion_data$.fitted >= 0.5)
observed_auto <- factor(confusion_data$is_auto == 1)
confMatrix <- confusionMatrix(data = predicted_auto,
                              reference = observed_auto,
                              positive = "TRUE")
confMatrixConfusion Matrix and Statistics
Reference
Prediction FALSE TRUE
FALSE 120 84
TRUE 225 762
Accuracy : 0.7406
95% CI : (0.7147, 0.7652)
No Information Rate : 0.7103
P-Value [Acc > NIR] : 0.01108
Kappa : 0.2828
Mcnemar's Test P-Value : 1.661e-15
Sensitivity : 0.9007
Specificity : 0.3478
Pos Pred Value : 0.7720
Neg Pred Value : 0.5882
Prevalence : 0.7103
Detection Rate : 0.6398
Detection Prevalence : 0.8287
Balanced Accuracy : 0.6243
'Positive' Class : TRUE
Here, we create both glm() and lrm() models for our model Y where we can see numerous things about the main effect. The coefficients are already exponentiated and represent the odds ratio of a vehicle’s transmission being automatic, so for example the coefficient for is_turbo1 is 0.9604. This means that with all other variables constant, a car with a turbo is 0.9604 times as likely to have an automatic transmission as one without a turbo. We can then see the numeric effect size values along with a plot of the effects. These effect values represent how the odds ratio changes for the specified change. With cylinders - 8:4 for instance, the effect size is 0.846820. This means that with all other variables held constant, a car with an engine with 8 cylinders will be 0.846820 times as likely to be an automatic as a car with a 4 cylinder engine. The Nagelkerke \(R^2\) value is 0.232 and the C statistic, or area under the ROC curve, is 0.753, which is pretty decent. For the confusion matrix, we used a prediction rule of if a fitted value is greater than 0.5, the observation will be classified as having an automatic transmission. The specificity is 0.3478, the sensitivity is 0.9007, and the positive predictive value is 0.7720.
# Spearman rho^2 screen: ranks the candidate predictors by their
# (rank-based) association with transmission, to decide where extra
# degrees of freedom (interactions/nonlinearity) are best spent.
modY_spear <- spearman2(transmission ~ decade + mpg + cylinders +
displacement + drive + is_turbo,
data = logistic_data)
plot(modY_spear)By looking at our Spearman \(\rho^2\) plot, we can see that cylinders, displacement, and decade are the variables most likely to make a difference. Because we are aiming to add between three and six degrees of freedom, we are going to add an interaction term between cylinders and displacement, which will add two degrees. We are also going to add an interaction term between displacement and decade, which will add another three degrees for a total of five.
# Model Z: model Y's main effects plus two interactions chosen from the
# Spearman screen — displacement x cylinders (+2 df) and
# displacement x decade (+3 df). Fit with glm() for tidy()/augment()
# and with lrm() for effect summaries and validation.
modelZ_glm <- glm(is_auto ~ decade + mpg + cylinders +
displacement + drive + is_turbo +
displacement * cylinders + displacement * decade,
data = logistic_data, family = binomial(link = "logit"))
# Refresh the rms datadist (same data, kept for clarity/reproducibility).
dist <- datadist(logistic_data)
options(datadist = "dist")
# x = TRUE, y = TRUE retain design matrix and response for validate().
modelZ_lrm <- lrm(is_auto ~ decade + mpg + cylinders +
displacement + drive + is_turbo +
displacement * cylinders + displacement * decade,
data = logistic_data, x = TRUE, y = TRUE)
# get coefficient values
tidy(modelZ_glm, exponentiate = TRUE, conf.int = TRUE, conf.level = 0.90) |>
select(term, estimate, conf.low, conf.high, p.value) |>
kable(dig = 4)| term | estimate | conf.low | conf.high | p.value |
|---|---|---|---|---|
| (Intercept) | 0.2271 | 0.0250 | 2.0748 | 0.2677 |
| decade1990s | 1.4086 | 0.6118 | 3.2645 | 0.5004 |
| decade2000s | 0.9744 | 0.3731 | 2.5311 | 0.9644 |
| decade2010+ | 1.7453 | 0.6015 | 4.9932 | 0.3858 |
| mpg | 1.0051 | 0.9609 | 1.0534 | 0.8550 |
| cylinders6 | 13.4114 | 2.4932 | 73.2019 | 0.0114 |
| cylinders8 | 1.8698 | 0.1061 | 31.1270 | 0.7165 |
| displacement | 2.2675 | 1.2787 | 4.0666 | 0.0198 |
| drive4-Wheel or All-Wheel Drive | 0.3569 | 0.1036 | 0.9774 | 0.1226 |
| driveAll-Wheel Drive | 1.5473 | 0.4061 | 5.1475 | 0.5625 |
| driveFront-Wheel Drive | 0.7013 | 0.2079 | 1.8632 | 0.5859 |
| driveRear-Wheel Drive | 0.2670 | 0.0794 | 0.7041 | 0.0416 |
| is_turbo1 | 1.0853 | 0.7362 | 1.6054 | 0.7295 |
| cylinders6:displacement | 0.5137 | 0.2733 | 0.9612 | 0.0812 |
| cylinders8:displacement | 0.8301 | 0.4015 | 1.7300 | 0.6745 |
| decade1990s:displacement | 0.9118 | 0.7023 | 1.1789 | 0.5565 |
| decade2000s:displacement | 1.1784 | 0.8772 | 1.5891 | 0.3623 |
| decade2010+:displacement | 1.2233 | 0.8765 | 1.7410 | 0.3317 |
# effects sizes
summary(modelZ_lrm) Effects Response : is_auto
Factor Low High Diff. Effect
mpg 17.0 23 6.0 0.030549
Odds Ratio 17.0 23 6.0 1.031000
displacement 2.2 4 1.8 1.836500
Odds Ratio 2.2 4 1.8 6.274500
decade - 1980s:2010+ 4.0 1 NA -1.161700
Odds Ratio 4.0 1 NA 0.312960
decade - 1990s:2010+ 4.0 2 NA -1.096200
Odds Ratio 4.0 2 NA 0.334130
decade - 2000s:2010+ 4.0 3 NA -0.695180
Odds Ratio 4.0 3 NA 0.498980
cylinders - 6:4 1.0 2 NA 0.597630
Odds Ratio 1.0 2 NA 1.817800
cylinders - 8:4 1.0 3 NA 0.067039
Odds Ratio 1.0 3 NA 1.069300
drive - 4-Wheel Drive:Front-Wheel Drive 4.0 1 NA 0.354830
Odds Ratio 4.0 1 NA 1.425900
drive - 4-Wheel or All-Wheel Drive:Front-Wheel Drive 4.0 2 NA -0.675520
Odds Ratio 4.0 2 NA 0.508890
drive - All-Wheel Drive:Front-Wheel Drive 4.0 3 NA 0.791350
Odds Ratio 4.0 3 NA 2.206400
drive - Rear-Wheel Drive:Front-Wheel Drive 4.0 5 NA -0.965840
Odds Ratio 4.0 5 NA 0.380660
is_turbo - 1:0 1.0 2 NA 0.081856
Odds Ratio 1.0 2 NA 1.085300
S.E. Lower 0.95 Upper 0.95
0.16719 -0.297130 0.35823
NA 0.742940 1.43080
0.66794 0.527350 3.14560
NA 1.694400 23.23500
0.28378 -1.717900 -0.60550
NA 0.179450 0.54580
0.27414 -1.633500 -0.55894
NA 0.195240 0.57182
0.26398 -1.212600 -0.17779
NA 0.297430 0.83712
0.32532 -0.039995 1.23520
NA 0.960790 3.43920
0.70991 -1.324400 1.45840
NA 0.265970 4.29920
0.65132 -0.921730 1.63140
NA 0.397830 5.11090
0.23916 -1.144300 -0.20677
NA 0.318460 0.81321
0.48019 -0.149800 1.73250
NA 0.860880 5.65480
0.20700 -1.371600 -0.56012
NA 0.253710 0.57114
0.23672 -0.382110 0.54582
NA 0.682420 1.72600
Adjusted to: decade=2010+ cylinders=4 displacement=3
plot(summary(modelZ_lrm)) # summary statistics
modelZ_lrm$stats["C"] C
0.7557988
modelZ_lrm$stats["R2"] R2
0.2382902
# Confusion matrix for model Z, using the same rule as before:
# classify as automatic when the fitted probability is at least 0.5.
confusion_data <- augment(modelZ_glm, type.predict = "response")
predicted_auto <- factor(confusion_data$.fitted >= 0.5)
observed_auto <- factor(confusion_data$is_auto == 1)
confMatrix <- confusionMatrix(data = predicted_auto,
                              reference = observed_auto,
                              positive = "TRUE")
confMatrixConfusion Matrix and Statistics
Reference
Prediction FALSE TRUE
FALSE 109 79
TRUE 236 767
Accuracy : 0.7355
95% CI : (0.7095, 0.7604)
No Information Rate : 0.7103
P-Value [Acc > NIR] : 0.02894
Kappa : 0.2572
Mcnemar's Test P-Value : < 2e-16
Sensitivity : 0.9066
Specificity : 0.3159
Pos Pred Value : 0.7647
Neg Pred Value : 0.5798
Prevalence : 0.7103
Detection Rate : 0.6440
Detection Prevalence : 0.8421
Balanced Accuracy : 0.6113
'Positive' Class : TRUE
Here, we create both glm() and lrm() models for our model Z. As before, the coefficients are already exponentiated and represent the odds ratio of a vehicle’s transmission being automatic. For example the coefficient for is_turbo1 is 1.0853, which means that with all other variables constant, a car with a turbo is 1.0853 times as likely to have an automatic transmission as one without a turbo. The effect size values represent how the odds ratio changes for the specified change. With cylinders - 8:4 for instance, the effect size is 0.067039. This means that with all other variables held constant, a car with an engine with 8 cylinders will be 0.067039 times as likely to be an automatic as a car with a 4 cylinder engine. The Nagelkerke \(R^2\) value is 0.238 and the C statistic, or area under the ROC curve, is 0.756. For the confusion matrix, we used a prediction rule of if a fitted value is greater than 0.5, the observation will be classified as having an automatic transmission. The specificity is 0.3159, the sensitivity is 0.9066, and the positive predictive value is 0.7647.
# validate model Y
set.seed(123454321)
validate(modelY_lrm) index.orig training test optimism index.corrected n
Dxy 0.5051 0.5175 0.4942 0.0233 0.4818 40
R2 0.2316 0.2426 0.2209 0.0217 0.2099 40
Intercept 0.0000 0.0000 0.0478 -0.0478 0.0478 40
Slope 1.0000 1.0000 0.9344 0.0656 0.9344 40
Emax 0.0000 0.0000 0.0233 0.0233 0.0233 40
D 0.1760 0.1854 0.1671 0.0183 0.1577 40
U -0.0017 -0.0017 0.0010 -0.0027 0.0010 40
Q 0.1777 0.1871 0.1661 0.0210 0.1567 40
B 0.1724 0.1705 0.1745 -0.0040 0.1763 40
g 1.2447 1.2998 1.2161 0.0837 1.1610 40
gp 0.2097 0.2141 0.2043 0.0097 0.2000 40
# validate model Z
set.seed(123454321)
validate(modelZ_lrm) index.orig training test optimism index.corrected n
Dxy 0.5116 0.5261 0.4947 0.0314 0.4802 40
R2 0.2383 0.2541 0.2234 0.0306 0.2076 40
Intercept 0.0000 0.0000 0.0665 -0.0665 0.0665 40
Slope 1.0000 1.0000 0.9048 0.0952 0.9048 40
Emax 0.0000 0.0000 0.0337 0.0337 0.0337 40
D 0.1816 0.1952 0.1692 0.0260 0.1556 40
U -0.0017 -0.0017 0.0017 -0.0034 0.0017 40
Q 0.1833 0.1969 0.1675 0.0294 0.1539 40
B 0.1716 0.1690 0.1745 -0.0055 0.1771 40
g 1.3150 1.3938 1.2645 0.1292 1.1858 40
gp 0.2131 0.2194 0.2056 0.0138 0.1993 40
By validating each of these models, we can see that for model Y, the validated \(R^2\) is 0.210 and the validated C statistic is 0.741. For model Z, the validated \(R^2\) is 0.208 and the validated C statistic is 0.740.
Model Y is better in terms of specificity, positive predictive value, validated \(R^2\), and validated C, while model Z is better only in sensitivity. Because of this, as well as the simpler structure of the model, we will select model Y as our final model.
# get parameters
tidy(modelY_glm, exponentiate = TRUE, conf.int = TRUE, conf.level = 0.90) |>
select(term, estimate, conf.low, conf.high, p.value) |>
kable(dig = 4)| term | estimate | conf.low | conf.high | p.value |
|---|---|---|---|---|
| (Intercept) | 0.8469 | 0.1749 | 4.4932 | 0.8651 |
| decade1990s | 1.0672 | 0.7682 | 1.4818 | 0.7445 |
| decade2000s | 1.6589 | 1.1861 | 2.3215 | 0.0131 |
| decade2010+ | 3.3587 | 2.1590 | 5.2760 | 0.0000 |
| mpg | 0.9859 | 0.9463 | 1.0284 | 0.5737 |
| cylinders6 | 2.3150 | 1.5174 | 3.5476 | 0.0011 |
| cylinders8 | 2.3322 | 1.0318 | 5.3079 | 0.0887 |
| displacement | 1.5127 | 1.1524 | 1.9930 | 0.0129 |
| drive4-Wheel or All-Wheel Drive | 0.3204 | 0.0938 | 0.8650 | 0.0849 |
| driveAll-Wheel Drive | 1.4390 | 0.3802 | 4.7397 | 0.6268 |
| driveFront-Wheel Drive | 0.6582 | 0.1962 | 1.7317 | 0.5179 |
| driveRear-Wheel Drive | 0.2454 | 0.0735 | 0.6390 | 0.0288 |
| is_turbo1 | 0.9604 | 0.6616 | 1.3971 | 0.8586 |
# effect sizes
summary(modelY_lrm) |> kable(digits = 3)| Low | High | Diff. | Effect | S.E. | Lower 0.95 | Upper 0.95 | Type | |
|---|---|---|---|---|---|---|---|---|
| mpg | 17.0 | 23 | 6.0 | -0.085 | 0.151 | -0.381 | 0.211 | 1 |
| Odds Ratio | 17.0 | 23 | 6.0 | 0.918 | NA | 0.683 | 1.235 | 2 |
| displacement | 2.2 | 4 | 1.8 | 0.745 | 0.299 | 0.158 | 1.332 | 1 |
| Odds Ratio | 2.2 | 4 | 1.8 | 2.106 | NA | 1.171 | 3.788 | 2 |
| decade - 1980s:2010+ | 4.0 | 1 | NA | -1.212 | 0.271 | -1.743 | -0.680 | 1 |
| Odds Ratio | 4.0 | 1 | NA | 0.298 | NA | 0.175 | 0.507 | 2 |
| decade - 1990s:2010+ | 4.0 | 2 | NA | -1.147 | 0.262 | -1.660 | -0.633 | 1 |
| Odds Ratio | 4.0 | 2 | NA | 0.318 | NA | 0.190 | 0.531 | 2 |
| decade - 2000s:2010+ | 4.0 | 3 | NA | -0.705 | 0.255 | -1.205 | -0.206 | 1 |
| Odds Ratio | 4.0 | 3 | NA | 0.494 | NA | 0.300 | 0.814 | 2 |
| cylinders - 6:4 | 1.0 | 2 | NA | 0.839 | 0.258 | 0.334 | 1.345 | 1 |
| Odds Ratio | 1.0 | 2 | NA | 2.315 | NA | 1.396 | 3.838 | 2 |
| cylinders - 8:4 | 1.0 | 3 | NA | 0.847 | 0.498 | -0.128 | 1.822 | 1 |
| Odds Ratio | 1.0 | 3 | NA | 2.332 | NA | 0.880 | 6.184 | 2 |
| drive - 4-Wheel Drive:Front-Wheel Drive | 4.0 | 1 | NA | 0.418 | 0.647 | -0.850 | 1.686 | 1 |
| Odds Ratio | 4.0 | 1 | NA | 1.519 | NA | 0.428 | 5.398 | 2 |
| drive - 4-Wheel or All-Wheel Drive:Front-Wheel Drive | 4.0 | 2 | NA | -0.720 | 0.236 | -1.183 | -0.257 | 1 |
| Odds Ratio | 4.0 | 2 | NA | 0.487 | NA | 0.306 | 0.773 | 2 |
| drive - All-Wheel Drive:Front-Wheel Drive | 4.0 | 3 | NA | 0.782 | 0.477 | -0.153 | 1.718 | 1 |
| Odds Ratio | 4.0 | 3 | NA | 2.186 | NA | 0.858 | 5.571 | 2 |
| drive - Rear-Wheel Drive:Front-Wheel Drive | 4.0 | 5 | NA | -0.987 | 0.204 | -1.387 | -0.586 | 1 |
| Odds Ratio | 4.0 | 5 | NA | 0.373 | NA | 0.250 | 0.557 | 2 |
| is_turbo - 1:0 | 1.0 | 2 | NA | -0.040 | 0.227 | -0.485 | 0.404 | 1 |
| Odds Ratio | 1.0 | 2 | NA | 0.960 | NA | 0.616 | 1.498 | 2 |
plot(summary(modelY_lrm)) decade: If we have two subjects from the same displacement, cylinders, drive, fuel type, is_turbo, and transmission, then if subject 1 was from the 1980’s and subject 2 was from the 2010+, then the model predicts subject 1 is 0.298 times as likely as subject 2 to have an automatic transmission. This makes sense as automatic cars have been becoming much more prominent, particularly in the United States. As such, it is much more likely for a more recent car to have an automatic transmission than an older car.
# Fitted probabilities from model Y on its training data,
# used as inputs to the ROC curve below.
predictions <- predict(modelY_glm, type = "response")
rocCurve <- roc(modelY_glm$data$is_auto, predictions)Setting levels: control = 0, case = 1
Setting direction: controls < cases
plot(rocCurve, col = "blue")
legend("topleft", legend = paste("AUC: ", round_half_up(auc(rocCurve), 3)))Here is our ROC curve for model Y with our entire dataset.
# statistics
set.seed(123454321)
validate(modelY_lrm) index.orig training test optimism index.corrected n
Dxy 0.5051 0.5175 0.4942 0.0233 0.4818 40
R2 0.2316 0.2426 0.2209 0.0217 0.2099 40
Intercept 0.0000 0.0000 0.0478 -0.0478 0.0478 40
Slope 1.0000 1.0000 0.9344 0.0656 0.9344 40
Emax 0.0000 0.0000 0.0233 0.0233 0.0233 40
D 0.1760 0.1854 0.1671 0.0183 0.1577 40
U -0.0017 -0.0017 0.0010 -0.0027 0.0010 40
Q 0.1777 0.1871 0.1661 0.0210 0.1567 40
B 0.1724 0.1705 0.1745 -0.0040 0.1763 40
g 1.2447 1.2998 1.2161 0.0837 1.1610 40
gp 0.2097 0.2141 0.2043 0.0097 0.2000 40
0.5 + (0.4818 / 2)[1] 0.7409
Here, we can see our validated \(R^2\) is 0.210 and our validated C statistic is 0.7409.
# nomogram
plot(nomogram(modelY_lrm))Here is the nomogram for model Y.
# Pull the two example vehicles for the predicted-probability
# comparison: manual Honda Accords from 1989 and 2012.
oldAccord <- epa_data |>
  filter(make == "Honda",
         model == "Accord",
         year == 1989,
         transmission == "Manual")
newAccord <- epa_data |>
  filter(make == "Honda",
         model == "Accord",
         year == 2012,
         transmission == "Manual")
# see each of their values
oldAccord# A tibble: 1 × 14
id make model year decade mpg cylinders displa…¹ drive fuel_…² is_tu…³
<chr> <chr> <chr> <chr> <fct> <dbl> <fct> <dbl> <fct> <fct> <fct>
1 5558 Honda Accord 1989 1980s 24 4 2 Fron… Regula… 0
# … with 3 more variables: transmission <fct>, yearly_fuel_cost <dbl>,
# log_yearly_fuel_cost <dbl>, and abbreviated variable names ¹displacement,
# ²fuel_type, ³is_turbo
newAccord# A tibble: 1 × 14
id make model year decade mpg cylinders displa…¹ drive fuel_…² is_tu…³
<chr> <chr> <chr> <chr> <fct> <dbl> <fct> <dbl> <fct> <fct> <fct>
1 31787 Honda Accord 2012 2010+ 27 4 2.4 Fron… Regula… 0
# … with 3 more variables: transmission <fct>, yearly_fuel_cost <dbl>,
# log_yearly_fuel_cost <dbl>, and abbreviated variable names ¹displacement,
# ²fuel_type, ³is_turbo
# get predicted values
predict(modelY_glm, newdata = oldAccord, type = "response") 1
0.4758318
predict(modelY_glm, newdata = newAccord, type = "response") 1
0.7751896
For our predicted probability example between two subjects of interest, we are using two manual Honda Accords, with one from 1989 and the other from 2012. The parameter values are very similar, with the differing ones being decade, mpg, and displacement. The mpg and displacement values are still similar between the two models, so the big contrasting parameter is decade. As we can see, the probability of the 1989 model having an automatic transmission is 0.476 and the probability of the 2012 model having an automatic transmission is 0.775.
Something that was substantially harder than I expected was getting my nomogram to present decently well for my final linear model. For the decade and drive variables, a couple of the levels’ labels would clash with each other because their point values are very close together. As such, I had to play around with the nomogram function’s parameters for awhile to make it so that all of the labels would be visible and obvious. While this isn’t a particularly hard issue to fix, it was tedious in having to look up the function documentation and then playing with various parameters to get an acceptable result.
The most useful thing that I learned was with interpreting the Spearman plots and creating interaction terms for our models. While I already knew how to independently read Spearman plots and create interaction terms, I didn’t have practical experience with determining what interaction terms to create based on additional degrees of freedom. This is useful because we want to be careful in how many degrees of freedom to add to the model in practical applications, so it was good to have to take this into account.
I am certain that it is completely appropriate for these EPA data to be shared with anyone, without any conditions. There are no concerns about privacy or security.
The data that we are using is located on Github (https://github.com/rfordatascience/tidytuesday/tree/master/data/2019/2019-10-15) and is originally from the Environmental Protection Agency (EPA). The dataset is similar to the mtcars dataset that is already built into R, except that it includes many more vehicles and variables. The full data dictionary for the dataset can also be found on fueleconomy.gov (https://www.fueleconomy.gov/feg/ws/index.shtml#fuelType1).
xfun::session_info()R version 4.2.2 (2022-10-31)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Big Sur ... 10.16
Locale: en_US.UTF-8 / en_US.UTF-8 / en_US.UTF-8 / C / en_US.UTF-8 / en_US.UTF-8
Package version:
abind_1.4-5 askpass_1.1 backports_1.4.1
base64enc_0.1-3 bit_4.0.5 bit64_4.0.5
blob_1.2.3 boot_1.3.28.1 brio_1.1.3
broom_1.0.3 bslib_0.4.2 cachem_1.0.7
callr_3.7.3 car_3.1-1 carData_3.0-5
caret_6.0-93 cellranger_1.1.0 checkmate_2.1.0
class_7.3-21 cli_3.6.0 clipr_0.8.0
clock_0.6.1 cluster_2.1.4 codetools_0.2-19
colorspace_2.1-0 compiler_4.2.2 conflicted_1.2.0
cpp11_0.4.3 crayon_1.5.2 curl_5.0.0
data.table_1.14.8 DBI_1.1.3 dbplyr_2.3.1
deldir_1.0-6 desc_1.4.2 diffobj_0.3.5
digest_0.6.31 dplyr_1.1.0 dtplyr_1.3.0
e1071_1.7-13 ellipsis_0.3.2 evaluate_0.20
fansi_1.0.4 farver_2.1.1 fastmap_1.1.1
forcats_1.0.0 foreach_1.5.2 foreign_0.8-84
Formula_1.2-5 fs_1.6.1 future_1.31.0
future.apply_1.10.0 gargle_1.3.0 generics_0.1.3
GGally_2.1.2 ggplot2_3.4.1 globals_0.16.2
glue_1.6.2 googledrive_2.0.0 googlesheets4_1.0.1
gower_1.0.1 graphics_4.2.2 grDevices_4.2.2
grid_4.2.2 gridExtra_2.3 gtable_0.3.1
hardhat_1.2.0 haven_2.5.1 highr_0.10
Hmisc_4.8-0 hms_1.1.2 htmlTable_2.4.1
htmltools_0.5.4 htmlwidgets_1.6.1 httr_1.4.5
ids_1.0.1 interp_1.1-3 ipred_0.9-13
isoband_0.2.7 iterators_1.0.14 janitor_2.2.0
jpeg_0.1-10 jquerylib_0.1.4 jsonlite_1.8.4
kableExtra_1.3.4 KernSmooth_2.23.20 knitr_1.42
labeling_0.4.2 lattice_0.20-45 latticeExtra_0.6-30
lava_1.7.2 lifecycle_1.0.3 listenv_0.9.0
lme4_1.1.31 lubridate_1.9.2 magrittr_2.0.3
MASS_7.3-58.2 Matrix_1.5-3 MatrixModels_0.5-1
memoise_2.0.1 methods_4.2.2 mgcv_1.8.41
mime_0.12 minqa_1.2.5 ModelMetrics_1.2.2.2
modelr_0.1.10 multcomp_1.4-22 munsell_0.5.0
mvtnorm_1.1-3 naniar_1.0.0 nlme_3.1-162
nloptr_2.0.3 nnet_7.3-18 norm_1.0.10.0
numDeriv_2016.8.1.1 openssl_2.0.5 parallel_4.2.2
parallelly_1.34.0 pbkrtest_0.5.2 pillar_1.8.1
pkgconfig_2.0.3 pkgload_1.3.2 plyr_1.8.8
png_0.1-8 polspline_1.1.22 praise_1.0.0
prettyunits_1.1.1 pROC_1.18.0 processx_3.8.0
prodlim_2019.11.13 progress_1.2.2 progressr_0.13.0
proxy_0.4-27 ps_1.7.2 purrr_1.0.1
quantreg_5.94 R6_2.5.1 ragg_1.2.5
rappdirs_0.3.3 RColorBrewer_1.1-3 Rcpp_1.0.10
RcppEigen_0.3.3.9.3 readr_2.1.4 readxl_1.4.2
recipes_1.0.5 rematch_1.0.1 rematch2_2.1.2
reprex_2.0.2 reshape_0.8.9 reshape2_1.4.4
rlang_1.0.6 rmarkdown_2.20 rms_6.5-0
rpart_4.1.19 rprojroot_2.0.3 rstudioapi_0.14
rvest_1.0.3 sandwich_3.0-2 sass_0.4.5
scales_1.2.1 selectr_0.4.2 snakecase_0.11.0
SparseM_1.81 splines_4.2.2 SQUAREM_2021.1
stats_4.2.2 stats4_4.2.2 stringi_1.7.12
stringr_1.5.0 survival_3.5-3 svglite_2.1.1
sys_3.4.1 systemfonts_1.0.4 testthat_3.1.6
textshaping_0.3.6 TH.data_1.1-1 tibble_3.1.8
tidyr_1.3.0 tidyselect_1.2.0 tidyverse_2.0.0
timechange_0.2.0 timeDate_4022.108 tinytex_0.44
tools_4.2.2 tzdb_0.3.0 UpSetR_1.4.0
utf8_1.2.3 utils_4.2.2 uuid_1.1.0
vctrs_0.5.2 viridis_0.6.2 viridisLite_0.4.1
visdat_0.6.0 vroom_1.6.1 waldo_0.4.0
webshot_0.5.4 withr_2.5.0 xfun_0.37
xml2_1.3.3 yaml_2.3.7 zoo_1.8-11